{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This code was used to collect YouTube data regarding the top creators from the YouTube Data API; TikTok creator data was collected using the front-end collection tool zeeschuimer: https://github.com/digitalmethodsinitiative/zeeschuimer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import Libraries\n",
    "import pandas as pd\n",
    "import requests\n",
    "import json\n",
    "import csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import API Keys\n",
    "# Load the configuration from the JSON file\n",
    "with open('config.json', 'r') as config_file:\n",
    "    config = json.load(config_file)\n",
    "\n",
    "# Access the API keys string\n",
    "api_keys_str = config.get('api_keys', '')\n",
    "\n",
    "# Split the string into an array\n",
    "api_keys = api_keys_str.split(',')\n",
    "api_keys = [key.strip() for key in api_keys]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(len(api_keys), api_keys)\n",
    "current_api_key = api_keys[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Fetch Video Details"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_channel_description(channel_id):\n",
    "   params = {\n",
    "   'part': 'snippet',\n",
    "   'id': channel_id,\n",
    "   'key': current_api_key\n",
    "   }\n",
    "\n",
    "   url = 'https://youtube.googleapis.com/youtube/v3/channels'\n",
    "   response_raw = requests.get(url, params=params)\n",
    "   if response_raw.status_code == 200:\n",
    "      response_json = response_raw.json()\n",
    "      if('items' in response_json):\n",
    "         response = response_json[\"items\"][0]['snippet']\n",
    "         channel_title = response.get('title', '')\n",
    "         channel_description = response.get('description', '')\n",
    "         channel_url = f'https://www.youtube.com/{response.get(\"customUrl\", \"\")}'\n",
    "         channel_publishedAt = response.get('publishedAt', '')\n",
    "         channel_thumbnails = {'default': response['thumbnails']['default'].get('url', ''), 'medium': response['thumbnails']['medium'].get('url', ''), 'high': response['thumbnails']['high'].get('url', '')}\n",
    "         channel_country = response.get('country', '')\n",
    "         return {'channel_title': channel_title, 'channel_description': channel_description, 'channel_url': channel_url, 'channel_publishedAt': channel_publishedAt, 'channel_thumbnails': channel_thumbnails, 'channel_country': channel_country}\n",
    "   return 'Error'\n",
    "\n",
    "def get_channel_videos(channel_id):\n",
    "   video_list = []\n",
    "   pageToken = None\n",
    "   calls = 0 \n",
    "   while True:\n",
    "      params = {\n",
    "      'part': 'snippet',\n",
    "      'channelId': channel_id,\n",
    "      'key': current_api_key,\n",
    "      'maxResults': 50,\n",
    "      'publishedAfter':'2012-01-01T00:00:00Z',\n",
    "      'publishedBefore':'2023-12-31T23:59:59Z',\n",
    "      'order': 'date',\n",
    "      'pageToken': pageToken\n",
    "      }\n",
    "      \n",
    "      url = 'https://youtube.googleapis.com/youtube/v3/search'\n",
    "      response = requests.get(url, params=params)\n",
    "      calls += 1\n",
    "      if response.status_code == 200:\n",
    "         response_json = response.json()\n",
    "         print(pageToken, response_json['pageInfo']['totalResults'])\n",
    "         if('nextPageToken' in response_json):\n",
    "            pageToken = response_json['nextPageToken']\n",
    "         else:\n",
    "            pageToken = None\n",
    "         if('items' in response_json):\n",
    "            for item in response_json['items']:\n",
    "               result = {}\n",
    "               result['videoId'] = item['id'].get('videoId', '')\n",
    "               viewCount, likeCount, commentCount = get_channel_video_stats(result['videoId'])\n",
    "               result['videoURL'] = f'https://www.youtube.com/watch?v={result[\"videoId\"]}'\n",
    "               result['viewCount'] = viewCount\n",
    "               result['likeCount'] = likeCount\n",
    "               result['commentCount'] = commentCount\n",
    "               result['publishedAt'] = item['snippet'].get('publishedAt', '')\n",
    "               result['title'] = item['snippet'].get('title', '')\n",
    "               result['description'] = item['snippet'].get('description', '')\n",
    "               result['thumbnails'] = {'default': item['snippet']['thumbnails']['default'].get('url', ''), 'medium': item['snippet']['thumbnails']['medium'].get('url', ''), 'high': item['snippet']['thumbnails']['high'].get('url', '')}\n",
    "               video_list.append(result)\n",
    "            \n",
    "            if(not pageToken):\n",
    "               break\n",
    "      else: \n",
    "         return video_list\n",
    "   return video_list, calls\n",
    "\n",
    "def get_channel_video_stats(video_id):\n",
    "   params = {\n",
    "    'part': 'statistics',\n",
    "    'id': video_id,\n",
    "    'key': current_api_key,\n",
    "    }\n",
    "   url = 'https://youtube.googleapis.com/youtube/v3/videos'\n",
    "   response = requests.get(url, params=params)\n",
    "   if response.status_code == 200:\n",
    "      try:\n",
    "         response_json = response.json()\n",
    "         statistics = response_json['items'][0]['statistics']\n",
    "      except:\n",
    "         return '', '', ''\n",
    "\n",
    "      view_count = statistics.get('viewCount', '')\n",
    "      like_count = statistics.get('likeCount', '')\n",
    "      comment_count = statistics.get('commentCount', '')\n",
    "\n",
    "      return view_count, like_count, comment_count\n",
    "   "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Update Json & CSV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ids = [\n",
    "'UCcvNYxWXR_5TjVK7cSCdW-g',\n",
    "'UCdgDIIKpFlpHB1L0LZnh5EQ',\n",
    "'UCe4LM_eKc9ywRmVuBm5pjQg'\n",
    "]\n",
    "for channel_id in ids:\n",
    "    print(channel_id)\n",
    "print(len(ids))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def update_json(channel_id, channel_details):\n",
    "    json_file_path = 'final_data.json'\n",
    "    # Read data from the JSON file\n",
    "    with open(json_file_path, 'r') as file:\n",
    "        data = json.load(file)\n",
    "    data[channel_id] = channel_details\n",
    "\n",
    "    # Write data to the JSON file\n",
    "    with open(json_file_path, 'w') as file:\n",
    "        json.dump(data, file)\n",
    "\n",
    "def update_csv(channel_id, channel_details):\n",
    "    df = pd.read_csv('final_data_4.csv')\n",
    "    for videos in channel_details['videos']:\n",
    "        data = [channel_id, channel_details['channel_title'], channel_details['channel_description'], channel_details['channel_url'], channel_details['channel_publishedAt'], channel_details['channel_thumbnails']['default'], channel_details['channel_thumbnails']['medium'], channel_details['channel_thumbnails']['high'], channel_details['channel_country']]\n",
    "        data.extend([videos['videoId'], videos['videoURL'], videos['viewCount'], videos['likeCount'], videos['commentCount'], videos['publishedAt'], videos['title'], videos['description'], videos['thumbnails']['default'], videos['thumbnails']['medium'], videos['thumbnails']['high']])\n",
    "        df.loc[len(df)] = data\n",
    "    df.to_csv('final_data_4.csv', header=True, index=False)\n",
    "\n",
    "# Fetch channel details \n",
    "for channel_id in ids:\n",
    "    channel_details = get_channel_description(channel_id)\n",
    "    channel_videos, calls = get_channel_videos(channel_id)\n",
    "    channel_details['videos'] = channel_videos\n",
    "    update_json(channel_id, channel_details)\n",
    "    update_csv(channel_id, channel_details)\n",
    "    print(calls)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('final_data_4.csv')\n",
    "print(len(df.columns))\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for index, row in df.iterrows():\n",
    "    params = {\n",
    "    'part': 'statistics',\n",
    "    'id': row['Channel_id'],\n",
    "    'key': api_keys[0],\n",
    "    }\n",
    "    url = 'https://youtube.googleapis.com/youtube/v3/channels'\n",
    "    response = requests.get(url, params=params)\n",
    "    if response.status_code == 200:\n",
    "        response_json = response.json()\n",
    "        df.at[index, 'viewCount'] = response_json['items'][0]['statistics']['viewCount']\n",
    "        df.at[index, 'subscriberCount'] = response_json['items'][0]['statistics']['subscriberCount']\n",
    "        df.at[index, 'videoCount'] = response_json['items'][0]['statistics']['videoCount']\n",
    "    else:\n",
    "        continue"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Get the remaning stats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "file_path = 'video_ids_final_0.csv'\n",
    "\n",
    "with open(file_path, 'r') as csv_file:\n",
    "    csv_reader = csv.reader(csv_file)\n",
    "    all_rows = [row for row in csv_reader]\n",
    "\n",
    "start_row = 25000\n",
    "end_row = len(all_rows)\n",
    "\n",
    "for i in range(start_row, end_row):\n",
    "    all_rows[i][1], all_rows[i][2], all_rows[i][3] = get_channel_video_stats(all_rows[i][0])\n",
    "\n",
    "with open(file_path, 'w', newline='') as csv_output:\n",
    "    csv_writer = csv.writer(csv_output)\n",
    "    csv_writer.writerows(all_rows) \n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Merge to priority list_1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_0 = pd.read_csv('final_data_0.csv')\n",
    "df_1 = pd.read_csv('final_data_1.csv')\n",
    "df_2 = pd.read_csv('final_data_2.csv')\n",
    "df_3 = pd.read_csv('final_data_3.csv')\n",
    "df_4 = pd.read_csv('final_data_4.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_result = pd.concat([df_0, df_1, df_2, df_3], ignore_index=True)\n",
    "print(len(df_result))\n",
    "df_result.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "list_df = pd.read_csv('channel_groupby_id.csv')\n",
    "list_df = list_df[(list_df['Priority'] == 2) | (list_df['Priority'] == 3)]\n",
    "priority_channels = list_df['Channel_id'].tolist()\n",
    "print(len(priority_channels),priority_channels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_result = df_result[df_result['channel_id'].isin(priority_channels)]\n",
    "print(len(df_result))\n",
    "df_result.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def transform_value(value):\n",
    "    # Example transformation, you can replace this with your own logic\n",
    "    return f'https://img.youtube.com/vi/{value}/maxresdefault.jpg'\n",
    "\n",
    "# Apply the function to create a new column 'B'\n",
    "df_result['thumbnail_maxres'] = df_result['videoId'].apply(transform_value)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(len(df_result))\n",
    "df_result.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_250_on = pd.read_csv('priority_2-3_channels_index_250_on.csv')\n",
    "df_final_result = pd.concat([df_result, df_250_on], ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(len(df_final_result))\n",
    "df_final_result.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_final_result.to_csv('priority_list_2-3.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_final_result['thumbnail_maxres'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_trending_videos():\n",
    "      video_list = []\n",
    "      pageToken = None\n",
    "      while True:\n",
    "         params = {\n",
    "         'part': 'snippet',\n",
    "         'key': current_api_key,\n",
    "         'chart':'mostPopular',\n",
    "         'maxResults': 50,\n",
    "         'pageToken': pageToken\n",
    "         }\n",
    "         \n",
    "         url = 'https://youtube.googleapis.com/youtube/v3/videos'\n",
    "         response = requests.get(url, params=params)\n",
    "         if response.status_code == 200:\n",
    "            response_json = response.json()\n",
    "            if('nextPageToken' in response_json):\n",
    "               pageToken = response_json['nextPageToken']\n",
    "            else:\n",
    "               pageToken = None\n",
    "            if('items' in response_json):\n",
    "               for item in response_json['items']:\n",
    "                  result = {}\n",
    "                  result['videoId'] = item['id']\n",
    "                  viewCount, likeCount, commentCount = get_channel_video_stats(result['videoId'])\n",
    "                  result['videoURL'] = f'https://www.youtube.com/watch?v={result[\"videoId\"]}'\n",
    "                  result['viewCount'] = viewCount\n",
    "                  result['likeCount'] = likeCount\n",
    "                  result['commentCount'] = commentCount\n",
    "                  result['channelTitle'] = item['snippet'].get('channelTitle', '')\n",
    "                  result['channelId'] = item['snippet'].get('channelId', '')\n",
    "                  result['publishedAt'] = item['snippet'].get('publishedAt', '')\n",
    "                  result['title'] = item['snippet'].get('title', '')\n",
    "                  result['description'] = item['snippet'].get('description', '')\n",
    "                  result['thumbnails'] = {'default': item['snippet']['thumbnails']['default'].get('url', ''), 'medium': item['snippet']['thumbnails']['medium'].get('url', ''), 'high': item['snippet']['thumbnails']['high'].get('url', '')}\n",
    "                  video_list.append(result)\n",
    "               \n",
    "               if(not pageToken):\n",
    "                  break\n",
    "      return video_list\n",
    "\n",
    "trending_videos = get_trending_videos()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = []\n",
    "for videos in trending_videos:\n",
    "    data.append([videos['videoId'], videos['videoURL'],videos['viewCount'],videos['likeCount'],videos['commentCount'],\n",
    "                 videos['channelTitle'],videos['channelId'], videos['publishedAt'],videos['title'],\n",
    "                 videos['description'],videos['thumbnails']['default'], \n",
    "                 videos['thumbnails']['medium'], videos['thumbnails']['high'] , \n",
    "                 f'https://img.youtube.com/vi/{videos[\"videoId\"]}/maxresdefault.jpg'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"trending_videos.csv\",\"a\") as my_csv:\n",
    "    csvWriter = csv.writer(my_csv,delimiter=',')\n",
    "    csvWriter.writerows(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('trending_videos.csv')\n",
    "len(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(pd.unique(df['1kETt59yn6A']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_top_videos():\n",
    "   video_list = []\n",
    "   pageToken = None\n",
    "   count = 0\n",
    "   while True:\n",
    "      count +=1\n",
    "      if(count == 10):\n",
    "         return video_list\n",
    "      params = {\n",
    "      'part': 'snippet',\n",
    "      'key': current_api_key,\n",
    "      'order':'viewCount',\n",
    "      'maxResults': 50,\n",
    "      'type': 'video'\n",
    "      }\n",
    "      \n",
    "      url = 'https://youtube.googleapis.com/youtube/v3/search'\n",
    "      response = requests.get(url, params=params)\n",
    "      if response.status_code == 200:\n",
    "         response_json = response.json()\n",
    "         if('nextPageToken' in response_json):\n",
    "            pageToken = response_json['nextPageToken']\n",
    "         else:\n",
    "            pageToken = None\n",
    "         if('items' in response_json):\n",
    "            for item in response_json['items']:\n",
    "               result = {}\n",
    "               result['videoId'] = item['id']['videoId']\n",
    "               viewCount, likeCount, commentCount = get_channel_video_stats(result['videoId'])\n",
    "               result['videoURL'] = f'https://www.youtube.com/watch?v={result[\"videoId\"]}'\n",
    "               result['viewCount'] = viewCount\n",
    "               result['likeCount'] = likeCount\n",
    "               result['commentCount'] = commentCount\n",
    "               result['channelTitle'] = item['snippet'].get('channelTitle', '')\n",
    "               result['channelId'] = item['snippet'].get('channelId', '')\n",
    "               result['publishedAt'] = item['snippet'].get('publishedAt', '')\n",
    "               result['title'] = item['snippet'].get('title', '')\n",
    "               result['description'] = item['snippet'].get('description', '')\n",
    "               result['thumbnails'] = {'default': item['snippet']['thumbnails']['default'].get('url', ''), 'medium': item['snippet']['thumbnails']['medium'].get('url', ''), 'high': item['snippet']['thumbnails']['high'].get('url', '')}\n",
    "               video_list.append(result)\n",
    "            \n",
    "            if(not pageToken):\n",
    "               break\n",
    "   return video_list\n",
    "\n",
    "top_videos = get_top_videos()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = []\n",
    "for videos in top_videos:\n",
    "    data.append([videos['videoId'], videos['videoURL'],videos['viewCount'],videos['likeCount'],videos['commentCount'],\n",
    "                 videos['channelTitle'],videos['channelId'], videos['publishedAt'],videos['title'],\n",
    "                 videos['description'],videos['thumbnails']['default'], \n",
    "                 videos['thumbnails']['medium'], videos['thumbnails']['high'] , \n",
    "                 f'https://img.youtube.com/vi/{videos[\"videoId\"]}/maxresdefault.jpg'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"top_videos.csv\",\"a\") as my_csv:\n",
    "    csvWriter = csv.writer(my_csv,delimiter=',')\n",
    "    csvWriter.writerows(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
